HMR region
library(Rsamtools)
## Loading required package: GenomeInfoDb
## Loading required package: BiocGenerics
## Loading required package: parallel
##
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:parallel':
##
## clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
## clusterExport, clusterMap, parApply, parCapply, parLapply,
## parLapplyLB, parRapply, parSapply, parSapplyLB
## The following objects are masked from 'package:stats':
##
## IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
##
## anyDuplicated, append, as.data.frame, basename, cbind, colnames,
## dirname, do.call, duplicated, eval, evalq, Filter, Find, get, grep,
## grepl, intersect, is.unsorted, lapply, Map, mapply, match, mget,
## order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank,
## rbind, Reduce, rownames, sapply, setdiff, sort, table, tapply,
## union, unique, unsplit, which, which.max, which.min
## Loading required package: S4Vectors
## Loading required package: stats4
##
## Attaching package: 'S4Vectors'
## The following object is masked from 'package:base':
##
## expand.grid
## Loading required package: IRanges
## Loading required package: GenomicRanges
## Loading required package: Biostrings
## Loading required package: XVector
##
## Attaching package: 'Biostrings'
## The following object is masked from 'package:base':
##
## strsplit
# file below is the mod_mappings.bam file on Savio, sorted using 'samtools sort'
bamPath <- "~/data/molly/mod_mappings.sorted.bam"
bamFile <- BamFile(bamPath)
bamFile
## class: BamFile
## path: /Users/koenvandenberge/data/molly/mod_mappings.sorted.bam
## index: /Users/koenvandenberge/data/molly/mod_mappings.sorted.bam.bai
## isOpen: FALSE
## yieldSize: NA
## obeyQname: FALSE
## asMates: FALSE
## qnamePrefixEnd: NA
## qnameSuffixStart: NA
scanBamHeader(bamFile)
## $targets
## I II III IV IX MT V VI VII VIII
## 230218 813184 316620 1531933 439888 85779 576874 270161 1090940 562643
## X XI XII XIII XIV XV XVI
## 745751 666816 1078177 924431 784333 1091291 948066
##
## $text
## $text$`@HD`
## [1] "VN:1.4" "SO:coordinate"
##
## $text$`@SQ`
## [1] "SN:I" "LN:230218"
##
## $text$`@SQ`
## [1] "SN:II" "LN:813184"
##
## $text$`@SQ`
## [1] "SN:III" "LN:316620"
##
## $text$`@SQ`
## [1] "SN:IV" "LN:1531933"
##
## $text$`@SQ`
## [1] "SN:IX" "LN:439888"
##
## $text$`@SQ`
## [1] "SN:MT" "LN:85779"
##
## $text$`@SQ`
## [1] "SN:V" "LN:576874"
##
## $text$`@SQ`
## [1] "SN:VI" "LN:270161"
##
## $text$`@SQ`
## [1] "SN:VII" "LN:1090940"
##
## $text$`@SQ`
## [1] "SN:VIII" "LN:562643"
##
## $text$`@SQ`
## [1] "SN:X" "LN:745751"
##
## $text$`@SQ`
## [1] "SN:XI" "LN:666816"
##
## $text$`@SQ`
## [1] "SN:XII" "LN:1078177"
##
## $text$`@SQ`
## [1] "SN:XIII" "LN:924431"
##
## $text$`@SQ`
## [1] "SN:XIV" "LN:784333"
##
## $text$`@SQ`
## [1] "SN:XV" "LN:1091291"
##
## $text$`@SQ`
## [1] "SN:XVI" "LN:948066"
##
## $text$`@RG`
## [1] "ID:1" "SM:SAMPLE"
seqinfo(bamFile)
## Seqinfo object with 17 sequences from an unspecified genome:
## seqnames seqlengths isCircular genome
## I 230218 <NA> <NA>
## II 813184 <NA> <NA>
## III 316620 <NA> <NA>
## IV 1531933 <NA> <NA>
## IX 439888 <NA> <NA>
## ... ... ... ...
## XII 1078177 <NA> <NA>
## XIII 924431 <NA> <NA>
## XIV 784333 <NA> <NA>
## XV 1091291 <NA> <NA>
## XVI 948066 <NA> <NA>
grHMR <- GRanges(seqnames = "III",
ranges = IRanges(start = 285e3, end = 300e3))
sbp <- ScanBamParam(which = grHMR, what = c("qname", # read ID
"seq", # sequence
"pos")) # read position (start)
aln <- scanBam(bamFile, param = sbp)
customParam <- ScanBamParam(which = grHMR,
tag=c("Mm", "Ml"),
what=c("flag", #custom tag (methylation)
"qname", # read ID
"seq", # sequence
"pos")) #start position
aln <- scanBam(bamFile, which=grHMR, param = customParam)
bases <- aln$`III:285000-300000`$seq
methylTag <- aln$`III:285000-300000`$tag$Mm
methylProb <- aln$`III:285000-300000`$tag$Ml
# methylation probabilities do seem to be very much read dependent
rafalib::mypar(mfrow=c(4,4))
hlp <- lapply(methylProb[1:112], hist, breaks=40, xlim=c(0,255))







# See https://github.com/jkbonfield/hts-specs/blob/methylation/SAMtags.tex
# for explanation on how the Mm and Ml tags work.
posProbList <- list()
for(rr in 1:length(aln$`III:285000-300000`$seq)){
# loop over each read
curSeq <- aln$`III:285000-300000`$seq[rr][[1]]
curStart <- aln$`III:285000-300000`$pos[rr]
curTag <- aln$`III:285000-300000`$tag$Mm[rr]
curProb <- aln$`III:285000-300000`$tag$Ml[[rr]]
# extract A bases and corresponding positions
# start at starting position, and total number of positions equal total number of As in sequence.
posA <- curStart + which(isMatchingAt("A", curSeq, at = 1:length(curSeq))) - 1
# get at which As are considered possibly methylated
tagSplit <- gsub(pattern=";", x=strsplit(curTag, split=",")[[1]], replacement="")
stopifnot(tagSplit[1] == "A+a") # check if we're looking at m6A
tagSplit <- as.numeric(tagSplit[-1])
# number of As considered for methylation should be equal to length of probability vector
stopifnot((sum(tagSplit == 0) + sum(tagSplit > 0)) == length(curProb))
#get pos and prob of As considered potentially methylated
posMet <- posA[cumsum(tagSplit+1)]
probMet <- curProb / 255
df <- data.frame(pos = posMet,
prob = probMet)
# #not considered methylated: doesn't work yet
# posNull <- posA[-cumsum(tagSplit+1)]
# probNull <- rep(0, length(posNull))
# df <- data.frame(pos = c(posMet, posNull),
# prob = c(probMet, probNull)) # positions and methyl probs
posProbList[[rr]] <- df
}
rafalib::mypar(mfrow=c(1,1))
pal <- wesanderson::wes_palette("Zissou1", n=256, type="continuous")
par(bty="l")
plot(x=seq(285e3, 300e3, length=length(posProbList)),
y=1:length(posProbList),
type = "n",
xlab = "Position on ChrIII",
ylab= "Read colored by methylation probability")
for(ii in 1:length(posProbList)){
curPos <- posProbList[[ii]]$pos
points(x=curPos, y=rep(ii, length(curPos)), col=pal[posProbList[[ii]]$prob*255],
pch=16, cex=1/20)
}
abline(v=292e3, lty=2)
abline(v=295e3, lty=2)

## setting a threshold on methylation
rafalib::mypar(mfrow=c(1,1))
#pal <- wesanderson::wes_palette("Zissou1", n=256, type="continuous")
par(bty="l")
plot(x=seq(285e3, 300e3, length=length(posProbList)),
y=1:length(posProbList),
type = "n",
xlab = "Position on ChrIII",
ylab= "Read colored by methylation probability",
main = "75% Methylation probability threshold")
for(ii in 1:length(posProbList)){
curDf <- posProbList[[ii]]
curDf <- curDf[curDf$prob > .75,]
curPos <- curDf$pos
points(x=curPos, y=rep(ii, length(curPos)),
pch=16, cex=1/20)
}
abline(v=292e3, lty=2)
abline(v=295e3, lty=2)

rafalib::mypar(mfrow=c(1,1))
#pal <- wesanderson::wes_palette("Zissou1", n=256, type="continuous")
par(bty="l")
plot(x=seq(285e3, 300e3, length=length(posProbList)),
y=1:length(posProbList),
type = "n",
xlab = "Position on ChrIII",
ylab= "Read colored by methylation probability",
main = "95% Methylation probability threshold")
for(ii in 1:length(posProbList)){
curDf <- posProbList[[ii]]
curDf <- curDf[curDf$prob > .95,]
curPos <- curDf$pos
points(x=curPos, y=rep(ii, length(curPos)),
pch=16, cex=1/20)
}
abline(v=292e3, lty=2)
abline(v=295e3, lty=2)
